import bamboolib as bam
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as mpl
df = pd.read_csv("Datasets/D1-1.csv")
df = df.loc[~(df['race'].isin(['Overall_totals']))]
df = df.loc[~(df['race'].isna())]
df = df.loc[~(df['job_category'].isin(['Previous_totals']))]
df['countInt'] = pd.to_numeric(df['count'], downcast='integer', errors='coerce')
df_totals = df.loc[df['job_category'].isin(['Totals'])]
tmp_groupby_df = df_totals.groupby(['company']).agg(**{"'countInt_sum'": ('countInt', 'sum')}).reset_index()
df_totals_1 = df_totals.merge(tmp_groupby_df, on=['company'])
df_company_totals = df_totals_1[['company', "'countInt_sum'"]]
df_company_totals = df_company_totals.drop_duplicates(subset=None, keep='first', inplace=False)
df_company_totals
df_company_totals = df_company_totals.rename(columns={'company': 'company', "'countInt_sum'": 'countInt_sum'})
df['percentage'] =""
#df.loc[df.company == df_company_totals.company, 'percentage'] = df_company_totals.loc['countInt_sum']
for index in df.index:
for company in df_company_totals.index:
if df_company_totals.loc[company,'company'] == df.loc[index,'company']:
df.loc[index,'percentage'] = df.loc[index,'countInt'].item()/df_company_totals.loc[company,'countInt_sum'].item()
df.loc[df['job_category'].isin(['Executive/Senior officials & Mgrs']), 'job_category'] = 'Executives'
df.loc[df['job_category'].isin(['First/Mid officials & Mgrs']), 'job_category'] = 'Managers'
df
df_pipo = df.groupby(['company']).agg(**{"'countInt_sum'": ('countInt', 'sum')}).reset_index()
df
df1 = pd.read_csv("Datasets/D2.csv")
df1['Female'] = df1.iloc[:,4]
df1['Male'] = df1.iloc[:,6]
df1['White'] = df1.iloc[:,9]
df1['Asian'] = df1.iloc[:,11]
df1['Latino'] = df1.iloc[:,13]
df1['Black'] = df1.iloc[:,15]
df1['Multi'] = df1.iloc[:,17]
df1['Other'] = df1.iloc[:,19]
df1 = df1.drop(columns=['%', '%.1', '%.2', '%.3', '%.4', '%.5', '%.6', '%.7', '%.8', 'Undeclared', 'Unnamed: 21'])
df1
missing_values_count = df.isnull().sum()
print (missing_values_count)
total_cells = np.product(df.shape)
total_missing = missing_values_count.sum()
print ("% of missing data = ",(total_missing/total_cells) * 100)
racegroups = df.groupby(["race"])
a = racegroups.get_group("Asian")
w= racegroups.get_group("White")
b= racegroups.get_group("Black_or_African_American")
h= racegroups.get_group("Hispanic_or_Latino")
t= racegroups.get_group("Two_or_more_races")
hw = racegroups.get_group("Native_Hawaiian_or_Pacific_Islander")
ind= racegroups.get_group("American_Indian_Alaskan_Native")
na = a["countInt"].sum()
nw = w["countInt"].sum()
nb= b["countInt"].sum()
nh= h["countInt"].sum()
nt= t["countInt"].sum()
nhw= hw["countInt"].sum()
nind= ind["countInt"].sum()
companies = df.groupby(['company'])
print("Race group distributions in dataset")
labels = 'Asian', 'White', 'Black_or_African_American', 'Hispanic_or_Latino', 'Two_or_more_races', 'Native_Hawaiian_or_Pacific_Islander', ' '
sizes = [na,nw,nb,nh,nt,nhw,nind]
fig1, ax1 = mpl.subplots(figsize=(8, 8))
explode = (0,0,0,0,0,0.1,0)
ax1.pie(sizes, explode=explode,labels=labels, autopct ='%1.1f%%', shadow=False, startangle=90)
ax1.axis('equal')
mpl.show()
import plotly.express as px
## bar chart on companies
print("Race distributions per company")
fig = px.histogram(df, x='company', y='countInt', color='race')
fig.update_yaxes(title_text='number of employees')
fig
import plotly.express as px
print("Race Distributions by job category")
fig = px.histogram(df, x='countInt', y='job_category', color='race')
fig
#Latinos
from plotly.subplots import make_subplots
import plotly.graph_objects as go
dfLatino = df.loc[df["race"].isin(["Hispanic_or_Latino"])]
dfLatino = dfLatino.loc[~(dfLatino['job_category'].isin(['Totals']))]
#dfLatino
dfWhite= df.loc[df["race"].isin(["White"])]
dfWhite = dfWhite.loc[~(dfWhite['job_category'].isin(['Totals']))]
#dfWhite
dfBlack= df.loc[df["race"].isin(["Black_or_African_American"])]
dfBlack = dfBlack.loc[~(dfBlack['job_category'].isin(['Totals']))]
dfAsian= df.loc[df["race"].isin(["Asian"])]
dfAsian = dfAsian.loc[~(dfAsian['job_category'].isin(['Totals']))]
dfHawai= df.loc[df["race"].isin(["Native_Hawaiian_or_Pacific_Islander"])]
dfHawai = dfHawai.loc[~(dfHawai['job_category'].isin(['Totals']))]
dfIndian= df.loc[df["race"].isin(["American_Indian_Alaskan_Native"])]
dfIndian = dfIndian.loc[~(dfIndian['job_category'].isin(['Totals']))]
dfTwo= df.loc[df["race"].isin(["Two_or_more_races"])]
dfTwo = dfTwo.loc[~(dfTwo['job_category'].isin(['Totals']))]
fig = make_subplots(rows = 4,cols = 2, start_cell="bottom-left", subplot_titles = ("Latino", "White", "Black","Asian",
"Native Hawaiian", "American Indian", "Two or More races"))
fig.add_trace(
go.Bar(x= dfLatino.job_category, y=dfLatino.countInt), row= 1,col=1)
fig.add_trace(
go.Bar(x=dfWhite.job_category, y=dfWhite.countInt),row=1,col=2)
fig.add_trace(
go.Bar(x=dfBlack.job_category, y=dfBlack.countInt),row=2,col=1)
fig.add_trace(
go.Bar(x=dfAsian.job_category, y=dfAsian.countInt),row=2,col=2)
fig.add_trace(
go.Bar(x=dfHawai.job_category, y=dfHawai.countInt),row=3,col=1)
fig.add_trace(
go.Bar(x=dfIndian.job_category, y=dfIndian.countInt),row=3,col=2)
fig.add_trace(
go.Bar(x=dfTwo.job_category, y=dfTwo.countInt),row=4,col=1)
fig.update_layout(height = 1500, width = 900, title_text = "Job Categories by Race Groups")
fig.show()
print("Gender representation by race")
fig0 = px.histogram(df, x='race', y='countInt', color='gender', template='plotly_white')
fig0.update_yaxes(title_text='Number of employees')
fig0
## pie chart on genders
print("Gender distribution in dataset")
genders = df.groupby(["gender"])
male= genders.get_group("male")
female= genders.get_group("female")
nmale = male["countInt"].sum()
nfemale = female["countInt"].sum()
labels = 'male', 'female'
sizes = [nmale,nfemale]
fig1, ax1 = mpl.subplots(figsize=(6, 6))
explode = (0,0)
ax1.pie(sizes, explode=explode,labels=labels, autopct ='%1.1f%%', shadow=False, startangle=90)
ax1.axis('equal')
mpl.show()
print("numer of males:", nmale)
print("numer of females:", nfemale)
In order to achieve this a number of computations using Shannon-Wiener Diversity Index will be calculated and will generate flexible information that can be adjusted to other datasets even when the diversity data may differ to the peresented here. The formula can automatically adjust the Eveness measure for datasets that identify different ethnic groups. The formula used for this is the following:
\begin{equation}H = - \sum Pi(\ln Pi) \end{equation}Where Pi is the proportion of each "group" in the sample. Once we find H which is the shannon diversity index, we can calculate (E) Evenness which will normalize the index into a more interpretable measure which ranges from 0 to lowest level of equal distribution accross groups to 1 which is perfect equitable distribution accross all groups. Evenness can be calculated as follows:
\begin{equation}E_H = \frac{ H }{H_{max}} \end{equation}Where H max is logarithm of the total number of groups.
#Executives
df_execs = df.loc[df['job_category'].isin(['Executives'])]
tmp_groupby_df = df_execs.groupby(['company']).agg(**{'countInt_sum': ('countInt', 'sum')}).reset_index()
#Managers
df_managers = df.loc[df['job_category'].isin(['Managers'])]
mtmp_groupby_df = df_managers.groupby(['company']).agg(**{'countInt_sum': ('countInt', 'sum')}).reset_index()
#Professionals
df_professionals = df.loc[df['job_category'].isin(['Professionals'])]
ptmp_groupby_df = df_professionals.groupby(['company']).agg(**{'countInt_sum': ('countInt', 'sum')}).reset_index()
#Other employees
df_others = df.loc[df['job_category'].isin(['Technicians', 'Sales workers', 'Administrative support', 'Craft workers', 'operatives', 'laborers and helpers', 'Service workers'])]
otmp_groupby_df = df_others.groupby(['company']).agg(**{'countInt_sum': ('countInt', 'sum')}).reset_index()
#total employees
df_totals = df
ttmp_groupby_df = df_totals.groupby(['company']).agg(**{'countInt_sum': ('countInt', 'sum')}).reset_index()
# Calculating each categorie's entropy as shannon index
def Sgenerator(yourDF,tempDF):
yourDF["S"]=""
for item in tempDF.index:
for index in yourDF.index:
if yourDF.loc[index,'company'] == tempDF.loc[item,'company']:
yourDF.loc[index,'S'] = tempDF.loc[item,'countInt_sum']
def piGenerator(yourDF):
yourDF['pi'] = yourDF['countInt']/yourDF['S']
yourDF['lnPi'] =np.where(yourDF['pi'] !=0.0,np.log(yourDF['pi'].astype(np.float64)),0)
yourDF['piXlnPi'] = yourDF['pi'] * yourDF['lnPi']
#EXECUTIVES
Sgenerator(df_execs,tmp_groupby_df)
piGenerator(df_execs)
#MANAGERS
Sgenerator(df_managers,mtmp_groupby_df)
piGenerator(df_managers)
#PROFESSIONALS
Sgenerator(df_professionals,ptmp_groupby_df)
piGenerator(df_professionals)
#OTHER EMPLOYEES
Sgenerator(df_others,otmp_groupby_df)
piGenerator(df_others)
#TOTALEMPLOYEES
Sgenerator(df_totals,ttmp_groupby_df)
piGenerator(df_totals)
#grouping by gender
#EXECUTIVES
df_execs_gender = df_execs.groupby(['company', 'gender']).agg(**{'countInt': ('countInt', 'sum')}).reset_index()
Sgenerator(df_execs_gender,tmp_groupby_df)
piGenerator(df_execs_gender)
#MANAGERS
df_managers_gender = df_managers.groupby(['company', 'gender']).agg(**{'countInt': ('countInt', 'sum')}).reset_index()
Sgenerator(df_managers_gender,mtmp_groupby_df)
piGenerator(df_managers_gender)
#TOTAL
df_totals_gender = df_totals.groupby(['company', 'gender']).agg(**{'countInt': ('countInt', 'sum')}).reset_index()
Sgenerator(df_totals_gender,ttmp_groupby_df)
piGenerator(df_totals_gender)
#calculating H Hmax and Eveness into tables
def calcH(yourDF):
df_execs_num_cats = yourDF.groupby(['company']).agg(number_caegories=('job_category', 'size')).reset_index()
df_execs_num_cats['Hmax'] = np.log(df_execs_num_cats['number_caegories'])
yourDF_H = yourDF.groupby(['company']).agg(H=('piXlnPi', 'sum')).reset_index()
yourDF_H['Hmax'] = df_execs_num_cats['Hmax']
yourDF_H['H+'] = yourDF_H['H'] * -1
yourDF_H['E'] = yourDF_H['H+']/yourDF_H['Hmax']
yourDF_H = yourDF_H.reset_index()
return yourDF_H
#EXECUTIVES
df_execs_H = calcH(df_execs)
#MANAGERS
df_managers_H = calcH(df_managers)
#PROFESSIONALS
df_professionals_H = calcH(df_professionals)
#OTHER EMPLOYEES
df_others_H = calcH(df_others)
#TOTAL EMPLOYEES
df_totals_H = calcH(df_totals)
#calculating H, Hmas and Evenness into tables for gender data
def calcHgender(yourDF):
df_execs_num_cats = yourDF.groupby(['company']).agg(number_caegories=('gender', 'size')).reset_index()
df_execs_num_cats['Hmax'] = np.log(df_execs_num_cats['number_caegories'])
yourDF_H = yourDF.groupby(['company']).agg(H=('piXlnPi', 'sum')).reset_index()
yourDF_H['Hmax'] = df_execs_num_cats['Hmax']
yourDF_H['H+'] = yourDF_H['H'] * -1
yourDF_H['E'] = yourDF_H['H+']/yourDF_H['Hmax']
yourDF_H = yourDF_H.reset_index()
return yourDF_H
df_execs_gender_H = calcHgender(df_execs_gender)
df_managers_gender_H = calcHgender(df_managers_gender)
df_totals_gender_H = calcHgender(df_totals_gender)
x_comp = pd.DataFrame(columns=['company','year','Eexec','EManager','EProfessional','EOther','ETotal','GenderExec','GenderManagement','GenderTotal'])
x_comp = x_comp.reset_index()
x_comp['company'] = df_totals_H['company']
x_comp['year'] = 2016
x_comp['Eexec'] = df_execs_H['E']
x_comp['EManager'] = df_managers_H['E']
x_comp['EProfessional'] = df_professionals_H['E']
x_comp['EOther'] = df_others_H['E']
x_comp['ETotal'] = df_totals_H['E']
x_comp['GenderExec'] = df_execs_gender_H['E']
x_comp['GenderManagement'] = df_managers_gender_H['E']
x_comp['GenderTotal'] = df_totals_gender_H['E']
x_comp
import plotly.express as px
fig = px.line(x_comp.sort_values(by=['company'], ascending=[True]), x='company', y=['Eexec', 'EManager', 'EProfessional', 'EOther', 'ETotal'], line_dash_sequence=['dot'])
fig.update_layout(xaxis_rangeslider_visible=True)
fig
fig = px.bar(x_comp, x='company', y='GenderTotal', color='GenderTotal', color_continuous_scale='pinkyl')
fig
fig = px.scatter_3d(x_comp, x='company', y='GenderExec', z='Eexec', color='Eexec', title='Executive Level Diversity and Gender Matrix')
fig.update_yaxes(title_text='Gender Balance')
xaxis=dict(
tickvals=x_comp['company'],
tickmode = 'linear',
title='Companies',
titlefont=dict(
family='Courier New, monospace',
size=18,
color='#7f7f7f'
)
)
fig.update_xaxes(dtick=6)
fig
fig = px.scatter_3d(x_comp, x='company', y='GenderManagement', z='EManager', color='EManager', title='Management Level Diversity and Gender Matrix')
fig.update_yaxes(title_text='Gender Balance')
fig.update_xaxes(dtick=6)
fig
Financial records from publicly traded firms are available through EDGAR database. The financial statements for the companies in our datasets have been extracted from EDGAR and analysed using the following methodologies. The financial analysis per company will include metrics for all areas of the business:
Working Capital Turnover: the result of \begin{equation}\frac{ Revenue }{average\, working \, capital}\end{equation} Total Assets Turnover: the result of \begin{equation}\frac{ Revenue }{average \, Fixed\,Assets}\end{equation}
Current ratio: the result of \begin{equation}\frac{ Current\,assets }{Current\,liabilities}\end{equation}
Solvency:
Debt-to-Assets: \begin{equation}\frac{ Total\,debt }{Total\,assets}\end{equation} Interest Coverage: \begin{equation}\frac{EBIT}{Interest\,payments}\end{equation}
Net profit margin: \begin{equation}\frac{ Net income }{Revenue}\end{equation} Operating margin: \begin{equation}\frac{ Operating profit }{Revenue}\end{equation} ROE: \begin{equation}\frac{ Net\, income }{Average\, total\,equity}\end{equation}
P/E: \begin{equation}\frac{ Price\,per\,share }{Earnings\,per\,share}\end{equation} P/BV: \begin{equation}\frac{ Price\,per\,share}{Book\,value\,per\,share}\end{equation}
Financial Leverage: \begin{equation}\frac{ Average\,total\,assets}{Total\,shareholders'\,equity}\end{equation}
Cash flow to revenue: \begin{equation}\frac{CFO}{Revenue}\end{equation} Debt payment: \begin{equation}\frac{CFO}{Cash\,paid\,for\,long\,term\,debt\,repayment}\end{equation} Investing & Financing: \begin{equation}\frac{CFO}{Cash\,outflows\,for\,investing\,and\,financing\,activities}\end{equation}
Financialsdf = pd.DataFrame(columns = ['company,','year','WCturnover','TAturnover','currentRatio','quickRatio',
'DebtToAssets','InterestCoverage', 'NetProfitMargin','operatingMargin','ROE','P/E',
'P/BV','FinancialLeverage','PerformanceRatio','CashflowToRevenue','DebtPayment','Investing&Financing'])
Financialsdf = Financialsdf.reset_index()
Financialsdf
df_financials = pd.read_excel("Datasets/Financial/AllCompanies_Financials.xlsx")
df_financials["WorkingCapitalTurnover"] = df_financials['Revenue']/df_financials['Working Capital Current']
df_financials["Asset Turnover"] = df_financials['Revenue']/df_financials['Total Assets']
df_financials["current ratio"] = df_financials['Current Assets']/df_financials['Current Liabilitites']
df_financials["Solvency"] = df_financials['Total Liabilities']/df_financials['Total Assets']
df_financials["Net Profit Margin"] = df_financials['Net Income']/df_financials['Revenue']
df_financials["operating margin"] = df_financials['Operating Profit']/df_financials['Revenue']
df_financials["ROE"] = df_financials['Net Income']/df_financials['Shareholders Equity']
df_financials["Leverage"] = df_financials['Total Assets']/df_financials['Shareholders Equity']
df_financials["Cashflow-Revenue"] = df_financials['CashFlow']/df_financials['Revenue']
df_financial_ratios = df_financials[['Company', 'Year', 'WorkingCapitalTurnover', 'Asset Turnover', 'current ratio', 'Solvency', 'Net Profit Margin', 'operating margin', 'ROE', 'Leverage', 'Cashflow-Revenue']]
df_financial_ratios
df_corr_explor = pd.merge(df_financial_ratios, x_comp, how='left', left_on=['Company'], right_on=['company'])
df_corr_explor = df_corr_explor.drop(columns=['year', 'company', 'index'])
df_corr_explor
import plotly.express as px
fig = px.scatter(df_corr_explor.dropna(subset=['current ratio']), x='Eexec', y='current ratio', symbol_sequence=['square-open'], color='Eexec', trendline='ols', hover_name='Company')
fig
fig = px.scatter(df_corr_explor.dropna(subset=['current ratio']), x='EManager', y='current ratio', color='EManager', symbol_sequence=['square-open'], trendline='ols',hover_name='Company')
fig
fig = px.scatter(df_corr_explor.dropna(subset=['Net Profit Margin']), x='EManager', y='Net Profit Margin', color='EManager', symbol_sequence=['square-open'], trendline='ols',hover_name='Company')
fig
fig = px.scatter(df_corr_explor.dropna(subset=['operating margin']), x='EManager', y='operating margin', color='EManager', symbol_sequence=['square-open'], trendline='ols',hover_name='Company')
fig
fig = px.scatter(df_corr_explor.dropna(subset=['operating margin']), x='GenderManagement', y='operating margin', color='GenderManagement', symbol_sequence=['square-open'], trendline='ols',hover_name='Company', color_continuous_scale='pinkyl')
fig
fig = px.scatter(df_corr_explor.dropna(subset=['Asset Turnover']), x='ETotal', y='Asset Turnover', color='EManager', symbol_sequence=['square-open'], trendline='ols',hover_name='Company')
fig
fig = px.scatter(df_corr_explor.dropna(subset=['Asset Turnover']), x='ETotal', y='Asset Turnover', color='EManager', symbol_sequence=['square-open'], trendline='ols',hover_name='Company')
fig
fig = px.scatter(df_corr_explor.dropna(subset=['ROE']), x='ETotal', color='EManager', symbol_sequence=['square-open'], trendline='ols', y='ROE',hover_name='Company')
fig
fig = px.scatter(df_corr_explor.dropna(subset=['Net Profit Margin']), x='GenderExec', color='EManager', symbol_sequence=['square'], trendline='ols', y='Net Profit Margin',hover_name='Company', color_continuous_scale='pinkyl')
fig
fig = px.scatter(df_corr_explor.dropna(subset=['operating margin']), x='GenderExec', color='EManager', symbol_sequence=['square'], trendline='ols', y='operating margin',hover_name='Company', color_continuous_scale='pinkyl')
fig
fig = px.scatter(df_corr_explor.dropna(subset=['operating margin']), x='GenderManagement', color='EManager', symbol_sequence=['square'], trendline='ols', y='operating margin',hover_name='Company', color_continuous_scale='pinkyl')
fig
fig = px.scatter(df_corr_explor.dropna(subset=['Cashflow-Revenue']), x='GenderManagement', color='EManager', symbol_sequence=['square'], trendline='ols', y='Cashflow-Revenue',hover_name='Company', color_continuous_scale='pinkyl')
fig
fig = px.scatter(df_corr_explor.dropna(subset=['Cashflow-Revenue']), x='GenderTotal', color='EManager', symbol_sequence=['square'], trendline='ols', y='Cashflow-Revenue',hover_name='Company', color_continuous_scale='pinkyl')
fig